# Per the original note, a recorded age of 0 is a stand-in for "unknown",
# so those entries are replaced with NA before any age analysis.
data$vict_age <- replace(data$vict_age, which(data$vict_age == 0), NA)

# Histogram of victim age in 5-year bins
ggplot(data = data, mapping = aes(x = vict_age)) +
  geom_histogram(
    binwidth = 5,
    alpha = 0.7,
    color = "black",
    fill = "#433E85FF"
  ) +
  ggtitle("Age Distribution of Victims") +
  xlab("Age") +
  ylab("Frequency") +
  theme_minimal()

# Bin victim age into four left-closed bands:
#   [-Inf, 18), [18, 40), [40, 60), [60, Inf)
data[["age_group"]] <- cut(
  x = data[["vict_age"]],
  breaks = c(-Inf, 18, 40, 60, Inf),
  labels = c("juvenile", "Young adult", "Middle-aged people", "The elderly"),
  right = FALSE
)

# Bar chart of record counts per age band; rows with unknown age are excluded
ggplot(
  data = subset(data, !is.na(vict_age)),
  mapping = aes(x = age_group)
) +
  geom_bar(alpha = 0.7, color = "black", fill = "#25858EFF") +
  ggtitle("Age Group Distribution of Victims") +
  xlab("Age Group") +
  ylab("Count") +
  theme_minimal()

# Severity label: part_1_2 == 1 becomes "Serious", every other value
# becomes "Less Serious". Stored directly as a factor.
data[["severity_label"]] <- as.factor(
  ifelse(data[["part_1_2"]] == 1, "Serious", "Less Serious")
)

# Chi-squared test of independence between age group and crime severity,
# run on the age group x severity contingency table
severity_age_table <- table(data[["age_group"]], data[["severity_label"]])
chisq_test <- chisq.test(x = severity_age_table)
print(chisq_test)
## 
##  Pearson's Chi-squared test
## 
## data:  severity_age_table
## X-squared = 4762.7, df = 3, p-value < 2.2e-16
# Report the outcome of the chi-squared test at the 5% significance level
print(
  if (chisq_test$p.value < 0.05) {
    "Age group has a statistically significant relationship with crime severity."
  } else {
    "No significant relationship between age group and crime severity."
  }
)
## [1] "Age group has a statistically significant relationship with crime severity."
# Stacked-to-100% bars: share of each severity level within every age group
# (rows with unknown age are excluded)
ggplot(
  data = subset(data, !is.na(vict_age)),
  mapping = aes(x = age_group, fill = severity_label)
) +
  geom_bar(alpha = 0.7, position = "fill") +
  labs(
    fill = "Crime Severity",
    y = "Proportion",
    x = "Age Group",
    title = "Crime Severity Distribution by Age Group"
  ) +
  theme_minimal()

# Crime severity by victim gender ----

# Keep only rows with a usable gender code: drop NA and the "-" placeholder
clean_data <- data[!is.na(data$vict_sex) & data$vict_sex != "-", ]

# Recode gender codes with clearer labels
clean_data <- clean_data %>%
  mutate(gender_label = recode(vict_sex,
                               "F" = "Female",
                               "M" = "Male",
                               "H" = "Intersex/Other",
                               "X" = "Unknown"))

# Count records per (severity, gender) pair, fill in any missing pair with a
# zero count, then express each count as a percentage of its gender's total.
# The final ungroup() returns a plain (ungrouped) tibble so that later steps
# operating on this object are not silently evaluated per gender.
severity_gender_data <- clean_data %>%
  group_by(severity_label, gender_label) %>%
  summarise(count = n(), .groups = "drop") %>%
  complete(severity_label, gender_label, fill = list(count = 0)) %>%
  group_by(gender_label) %>%
  mutate(percentage = count / sum(count) * 100) %>%
  ungroup()

# Order rows by gender, largest share first within each gender
severity_gender_table <- severity_gender_data %>%
  arrange(gender_label, desc(percentage))

# Display the table
kable(
  severity_gender_table,
  format = "html",
  caption = "Crime Severity by Gender and Percentage"
) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
## Crime Severity by Gender and Percentage
## severity_label gender_label count percentage
## Less Serious Female 197898 56.10149
## Serious Female 154852 43.89851
## Serious Intersex/Other 70 62.50000
## Less Serious Intersex/Other 42 37.50000
## Serious Male 237003 59.73325
## Less Serious Male 159766 40.26675
## Serious Unknown 57399 60.70050
## Less Serious Unknown 37162 39.29950
# One pie per gender: slices are the severity percentages, each annotated
# with its value rounded to one decimal place
ggplot(
  data = severity_gender_data,
  mapping = aes(x = "", y = percentage, fill = severity_label)
) +
  geom_bar(alpha = 0.7, width = 1, stat = "identity") +
  geom_text(
    mapping = aes(label = paste0(round(percentage, 1), "%")),
    size = 4,
    position = position_stack(vjust = 0.5) # centre each label in its slice
  ) +
  coord_polar(theta = "y") +
  facet_wrap(vars(gender_label)) + # facets use the recoded gender labels
  labs(
    fill = "Crime Severity",
    x = NULL,
    y = NULL,
    title = "Crime Severity Distribution by Gender"
  ) +
  theme_minimal() +
  theme(
    panel.grid = element_blank(),
    axis.ticks = element_blank(),
    axis.text = element_blank()
  )

# Drop records whose descent code is missing or the "-" placeholder, then
# translate the single-letter codes into readable names. Any code without an
# explicit mapping falls into "Others".
clean_data <- data %>%
  filter(!is.na(vict_descent), vict_descent != "-") %>%
  mutate(
    vict_descent = recode(
      vict_descent,
      "B" = "Black",
      "H" = "Hispanic",
      "W" = "White",
      "X" = "Unknown",
      "O" = "Others",
      .default = "Others"
    )
  )

# Share of each descent category. Categories below 5% are folded into
# "Others", after which counts and percentages are re-totalled per category.
race_distribution <- clean_data %>%
  count(vict_descent, name = "count") %>%
  mutate(
    percentage = count / sum(count) * 100,
    vict_descent = ifelse(
      vict_descent == "Others" | percentage < 5,
      "Others",
      vict_descent
    )
  ) %>%
  group_by(vict_descent) %>%
  summarise(
    count = sum(count),
    percentage = sum(percentage),
    .groups = "drop"
  )

# Pie chart of the descent shares, each slice labelled with its percentage
# rounded to one decimal place
ggplot(
  data = race_distribution,
  mapping = aes(x = "", y = percentage, fill = vict_descent)
) +
  geom_bar(alpha = 0.7, width = 1, stat = "identity") +
  geom_text(
    mapping = aes(label = paste0(round(percentage, 1), "%")),
    size = 4,
    position = position_stack(vjust = 0.5) # centre each label in its slice
  ) +
  coord_polar(theta = "y") +
  labs(
    fill = "Race",
    x = NULL,
    y = NULL,
    title = "Racial Distribution of Victims"
  ) +
  theme_minimal() +
  theme(
    panel.grid = element_blank(),
    axis.ticks = element_blank(),
    axis.text = element_blank()
  )

# Crime severity by age group, one facet per victim gender code.
# Rows with unknown age are excluded. Rows whose gender code is NA or the "-"
# placeholder are excluded too, using the same rule as the gender cleaning
# earlier in this script, so no facet is drawn for invalid gender codes.
ggplot(
  data[
    !is.na(data$vict_age) &
      !is.na(data$vict_sex) &
      data$vict_sex != "-",
  ],
  aes(x = age_group, fill = severity_label)
) +
  geom_bar(position = "dodge", alpha = 0.7) +
  facet_wrap(~ vict_sex) +
  theme_minimal() +
  labs(
    title = "Crime Severity Distribution by Age Group and Gender",
    x = "Age Group",
    y = "Count",
    fill = "Crime Severity"
  ) +
  theme(
    # Rotate the age-group labels so they do not overlap within narrow facets
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Severity share within each age group, one facet per descent code.
# Rows with unknown age, or with a missing / "-" descent code, are excluded.
ggplot(
  data = subset(
    data,
    !is.na(vict_age) & !is.na(vict_descent) & vict_descent != "-"
  ),
  mapping = aes(x = age_group, fill = severity_label)
) +
  geom_bar(alpha = 0.7, position = "fill") +
  facet_wrap(vars(vict_descent)) +
  labs(
    fill = "Crime Severity",
    y = "Proportion",
    x = "Age Group",
    title = "Crime Severity by Age Group and Race"
  ) +
  theme_minimal() +
  theme(
    # Rotate the age-group labels so they stay readable in small facets
    axis.text.x = element_text(hjust = 1, angle = 45)
  )

# Boxplot of victim age for every area, with outliers drawn as small black
# points
box_age_area <- ggplot(
  data = data,
  mapping = aes(x = area_name, y = vict_age, fill = area_name)
) +
  geom_boxplot(alpha = 0.7, outlier.size = 0.5, outlier.color = "black") +
  labs(
    fill = "Area",
    y = "Victim Age",
    x = "Area",
    title = "Age Distribution by Area"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none", # legend would only repeat the x-axis labels
    axis.text.x = element_text(hjust = 1, angle = 45) # keep area names legible
  )

# Render the boxplot as an interactive plotly widget
ggplotly(box_age_area)